
global username = c(username)
global dropbox "/Users/${username}/Dropbox"
global master_file_path "${dropbox}/Engel_GFT/replication_files"

global input "$master_file_path\data/intermediate_data"
global output "$master_file_path\data/intermediate_data/conventional_price_indices"

cd "$master_file_path\data\intermediate_data"
set more off

**************************************************************
****      Part 1: Consumption within each round        *******
**************************************************************

if 1==1 {
	
  *(1) round 43
   set more off


    use "R43R55/expenditures_item_level_R43R55_i_groupsV1.dta" if round==43, clear

    *drop households with flagged discrepancies between sum of exp. and mpce
    drop if flag_mpce==1

    *drop goods with no quantity data
    drop if quantity==.
    drop if quantity==0

    *Keep only goods in our 34 i groupings:
    drop if flag_deaton_drops==1

	replace concorded_itemlabel2=trim(concorded_itemlabel2)

	*Get rid of products beyond fuel and light:
	drop if round==43 & itemcode43>=480
	drop if round==55 & Srl_no_of_item>=360

    *calculate expenditures and quantities at the hhid X item level:
    collapse (sum) expenditure quantity, by(state43 district43 sector wt hhsize mpce hhid concorded_itemlabel2 i_groupsV1 g_groupsV3)

    gen price=expenditure/quantity
    replace price=round(price, 0.01)

    *Angus' automatic test for unit price outliers:
    gen log_price=log(price)
    egen sd_log_price=sd(log_price), by(concorded_itemlabel2)
    egen mean_log_price=mean(log_price), by(concorded_itemlabel2)
    gen diff=abs(log_price-mean_log_price)
    gen compare=0
    replace compare=1 if diff>2*sd_log_price
    drop if compare==1
    drop compare diff mean_log_price sd_log_price log_price

    bysort concorded_itemlabel2 state43 sector: egen price_state=median(price)
    replace price_state=round(price_state, 0.01)

    bysort concorded_itemlabel2 state43 district43 sector: egen price_district=median(price)
    replace price_district=round(price_district, 0.01)
	
    egen item_seq = group(concorded_itemlabel2), label
    egen i_groupsV1_seq=group(i_groupsV1), label

    preserve
    
	duplicates drop item_seq concorded_itemlabel2 i_groupsV1 g_groupsV3, force
    keep concorded_itemlabel2 item_seq i_groupsV1 i_groupsV1_seq
    save "R43R55\item_seq_igroupsV1.dta", replace

    *Household level again here:
    restore

    keep expenditure quantity price_state price_district state43 sector district43 hhid wt hhsize mpce item_seq i_groupsV1_seq

    egen sum_exp=total(expenditure), by(state43 sector district43 hhid)

    *Household level dataset with expenditures and prices at the item level:

    gen igroupsV1_itemseq=i_groupsV1_seq*10000+item_seq

    reshape wide expenditure quantity price_state price_district , i(state43 district43 sector wt hhsize mpce hhid sum_exp i_groupsV1 item_seq) j(igroupsV1_itemseq)

    *Fill in with 0 in case of no purchases:
         foreach var of varlist expenditure* quantity* {
	     replace `var'=0 if `var'==.
		 }

    *Create district level budget shares (plutocratic):
        *Total expenditure by district:
         bysort state43 sector district43: egen tot_exp_dist=total(sum_exp*wt)

         *Expenditure on each good by district and sector:
         foreach var of varlist expenditure* {
         loc a=substr("`var'",12,.)
         egen exp_`a'=total(`var'*wt) ,  by(state43 sector district43)
         gen pluto_share_`a'=exp_`a'/tot_exp_dist

		 }
         drop exp_* tot_exp_dist

    *Create district level budget shares (democratic):
         *Average hhd share on each good by district:
         foreach var of varlist expenditure* {
         loc a=substr("`var'",12,.)
         gen aux_`a'=(`var'/sum_exp)
         egen demo_share_`a'=wtmean(aux_`a') , weight(wt)  by(state43 sector district43)
		 }

    *fill district level prices:
  foreach var of varlist price_district* {
     loc a=substr("`var'",15,.)
	*can use the max since they are all the same:
    bysort state43 sector district43: egen price_dist_`a'=max(`var')
	 cap drop `var'
	 }


    *fill state level prices:
  foreach var of varlist price_state* {
     loc a=substr("`var'",12,.)
	*can use the max since they are all the same:
    bysort state43 sector : egen price_st_`a'=max(`var')
	 cap drop `var'
	 }


    egen dist_wt=total(wt), by(state43 sector district43)

    duplicates drop state43 sector district43, force


   keep state43 sector district43 price_dist_* pluto_share_* demo_share_* price_st_* dist_wt

   save "$output/R43_district_level_V1_D", replace

*end of if 1==1:
}






if 1==1 {
	
  *(1) round 55
   set more off


    use "R43R55/expenditures_item_level_R43R55_i_groupsV1.dta" if round==55, clear

    *drop households with flagged discrepancies between sum of exp. and mpce
    drop if flag_mpce==1

    *drop goods with no quantity data
    drop if quantity==.
    drop if quantity==0

    *Keep only goods in our 34 i groupings:
    drop if flag_deaton_drops==1

	replace concorded_itemlabel2=trim(concorded_itemlabel2)

	*Make list of goods consistent over rounds:
	*note: other cereals only drops in R55:
    drop if concorded_itemlabel2=="Other cereals"


	*Get rid of products beyond fuel and light:
	drop if round==43 & itemcode43>=480
	drop if round==55 & Srl_no_of_item>=360

	replace quantity=quantity/100
	replace quantity=round(quantity, 0.01)

    *calculate expenditures and quantities at the hhid X item level:
    collapse (sum) expenditure quantity, by(state43 district43 sector wt hhsize mpce hhid concorded_itemlabel2 i_groupsV1 g_groupsV3)

    gen price=expenditure/quantity
    replace price=round(price, 0.01)

    *Angus' automatic test for unit price outliers:
    gen log_price=log(price)
    egen sd_log_price=sd(log_price), by(concorded_itemlabel2)
    egen mean_log_price=mean(log_price), by(concorded_itemlabel2)
    gen diff=abs(log_price-mean_log_price)
    gen compare=0
    replace compare=1 if diff>2*sd_log_price
    drop if compare==1
    drop compare diff mean_log_price sd_log_price log_price

    bysort concorded_itemlabel2 state43 sector: egen price_state=median(price)
    replace price_state=round(price_state, 0.01)

    bysort concorded_itemlabel2 state43 district43 sector: egen price_district=median(price)
    replace price_district=round(price_district, 0.01)
	
    egen item_seq = group(concorded_itemlabel2), label
    egen i_groupsV1_seq=group(i_groupsV1), label

    keep expenditure quantity price_state price_district state43 sector district43 hhid wt hhsize mpce item_seq i_groupsV1_seq

    egen sum_exp=total(expenditure), by(state43 sector district43 hhid)

    *Household level dataset with expenditures and prices at the item level:

    gen igroupsV1_itemseq=i_groupsV1_seq*10000+item_seq

        reshape wide expenditure quantity price_state price_district , i(state43 district43 sector wt hhsize mpce hhid sum_exp i_groupsV1 item_seq) j(igroupsV1_itemseq)

    *Fill in with 0 in case of no purchases:
         foreach var of varlist expenditure* quantity* {
	     replace `var'=0 if `var'==.
		 }

    *Create district level budget shares (plutocratic):
        *Total expenditure by district:
         bysort state43 sector district43: egen tot_exp_dist=total(sum_exp*wt)

         *Expenditure on each good by district and sector:
         foreach var of varlist expenditure* {
         loc a=substr("`var'",12,.)
         egen exp_`a'=total(`var'*wt) ,  by(state43 sector district43)
         gen pluto_share_`a'=exp_`a'/tot_exp_dist

		 }
         drop exp_* tot_exp_dist

    *Create district level budget shares (democratic):
         *Average hhd share on each good by district:
         foreach var of varlist expenditure* {
         loc a=substr("`var'",12,.)
         gen aux_`a'=(`var'/sum_exp)
         egen demo_share_`a'=wtmean(aux_`a') , weight(wt)  by(state43 sector district43)
		 }


    *fill district level prices:
  foreach var of varlist price_district* {
     loc a=substr("`var'",15,.)
	*can use the max since they are all the same:
    bysort state43 sector district43: egen price_dist_`a'=max(`var')
	 cap drop `var'
	 }


    *fill state level prices:
   foreach var of varlist price_state* {
     loc a=substr("`var'",12,.)
	*can use the max since they are all the same:
    bysort state43 sector : egen price_st_`a'=max(`var')
	 cap drop `var'
	 }


    egen dist_wt=total(wt), by(state43 sector district43)

    duplicates drop state43 sector district43, force


   keep state43 sector district43 price_dist_* pluto_share_* demo_share_* price_st_* dist_wt

   save "$output/R55_district_level_V1_D", replace


*end of 1==1:
}





















**************************************************************
****        Part 2: Construct CPI: across rounds       *******
****
*State level (Paasche and Laspeyres, plutocratic and democratic)
*District level (Paasche and Laspeyres, plutocratic and democratic)
**************************************************************


**************************************************************
*(1) District level (Paasche and Laspeyres, plutocratic and democratic), R43, R55 for 34 i items
**************************************************************

if 1==1{
local round1 "43"
local round2 "55"
forvalues i=1/1 {
local initial: word `i' of `round1'
local final: word `i' of `round2'

   set more off
	 use  "$output/R`final'_district_level_V1_D", clear

*Recall we are using median prices at district or state level.
*Replace median P with log(median P) so we can calculate changes:

    foreach var of varlist price_dist_*  {
    local i=substr("`var'",12,6)	
    qui gen  ln_`var' =log(price_dist_`i')
    drop `var'
    }

    foreach var of varlist  price_st_* {
    local i=substr("`var'",10,6)	
    qui gen ln_`var'=log(price_st_`i')
    drop `var'
    }

    order sector state43 district43 dist_wt
    *rename with round first:
	foreach var of varlist pluto_share_10005-ln_price_st_340116 {
	    rename `var' _`final'_`var'
		}

	merge 1:1 state43 sector district43 using "$output/R`initial'_district_level_V1_D"

    *782 districts match:
    keep if _merge==3
	drop _merge

    foreach var of varlist price_dist_*  {
    local i=substr("`var'",12,6)	
    qui gen  ln_`var' =log(price_dist_`i')
    drop `var'
    }

    foreach var of varlist  price_st_* {
    local i=substr("`var'",10,6)	
    qui gen ln_`var'=log(price_st_`i')
    drop `var'
    }

    order sector state43 district43 dist_wt
    *rename with round first:
	foreach var of varlist pluto_share_10005-ln_price_st_340116 {
	    rename `var' _`initial'_`var'
		}

*Calculate log price differences:

	 foreach var of varlist _`initial'_ln_price_dist_* {
	   loc a=substr("`var'",19,.)
	    qui gen d_ln_p_dist_`final'_`initial'_`a'= _`final'_ln_price_dist_`a'-_`initial'_ln_price_dist_`a'

        *replace with state price change if missing at district level:
        qui replace d_ln_p_dist_`final'_`initial'_`a'=_`final'_ln_price_st_`a'-_`initial'_ln_price_st_`a' if d_ln_p_dist_`final'_`initial'_`a'==.

        *replace with national price change if missing at state level:
        qui egen national_final=median(_`final'_ln_price_st_`a')
        qui egen national_initial=median(_`initial'_ln_price_st_`a')
        qui replace d_ln_p_dist_`final'_`initial'_`a'=national_final-national_initial if d_ln_p_dist_`final'_`initial'_`a'==.
        drop national_initial national_final _`final'_ln_price_dist_`a' _`initial'_ln_price_dist_`a' _`final'_ln_price_st_`a' _`initial'_ln_price_st_`a'
	   }


*Calculate weighted average change in prices for the 34 i's:
*This is done in two cases separately, for item i from 1-9 and item i- 10-34 (2 digits):

    tempfile dprices
    save `dprices'

    foreach var of varlist  _43_demo_share_* {
    use `dprices',clear
    if regexm("`var'","_[0-9][0-9][0-9][0-9][0-9]$")==1 {
    local i=substr("`var'",16,1)
    local item=substr("`var'",18,3)

    *This drop variables that have 6 digits than `i'0`item':
        foreach var2 of varlist  _43_demo_share_* {
        if regexm("`var2'","_[0-9][0-9][0-9][0-9][0-9]$")==0 {
        local d=substr("`var2'",16,.)
        drop _*_*_share_`d' d_ln_p_dist_*_*_`d'
        }
        }

    *Keep only items in `i' (should only have variables with 5 digits here):
    keep sector-dist_wt  _*_*_share_`i'0*    d_ln_p_dist_*_*_`i'0*
    gen good_i=`i'
    order good_i, after(dist_wt)

    *gen sum of budget shares (these are only the items that are components of good i):
    egen sum_demo_shares_`i'=rowtotal(_43_demo_share_*), mis
    egen sum_pluto_shares_`i'=rowtotal(_43_pluto_share_*), mis

	*Calculate weighted averages (using initial shares): 
*MGN Note 10/17/2020: I think we should have used Tornqvist, which weight d log price changes by the average share (See Deaton 2003 pg 364)
    foreach var3 of varlist d_ln_p_dist_`final'_`initial'_`i'0* {
	   loc a=substr("`var3'",21,.)
	   gen aux_d_ln_p_dist_`final'_`initial'_`i'0`a'=`var3'*(_`initial'_pluto_share_`i'0`a')
        gen aux_d_ln_d_dist_`final'_`initial'_`i'0`a'=`var3'*(_`initial'_demo_share_`i'0`a')
	   }
    egen sum_d_ln_p_dist_`final'_`initial'_`i'=rowtotal(aux_d_ln_p_dist_`final'_`initial'_`i'0*), mis
    egen sum_d_ln_d_dist_`final'_`initial'_`i'=rowtotal(aux_d_ln_d_dist_`final'_`initial'_`i'0*), mis

    gen d_ln_p_pluto_`final'_`initial'_`i'=sum_d_ln_p_dist_`final'_`initial'_`i'/sum_pluto_shares_`i'
    gen d_ln_p_demo_`final'_`initial'_`i'=sum_d_ln_d_dist_`final'_`initial'_`i'/sum_demo_shares_`i'
}


    *The case in which the variable name refers to item i from 10-34:
    if regexm("`var'","_[0-9][0-9][0-9][0-9][0-9][0-9]$")==1 {
    local i=substr("`var'",16,2)
    local item=substr("`var'",19,3)

    *This drop variables that have 5 digits than `i'0`item':
        foreach var2 of varlist  _43_demo_share_* {
        if regexm("`var2'","[0-9][0-9][0-9][0-9][0-9][0-9]$")==0 {
        local d=substr("`var2'",16,.)
        drop _*_*_share_`d' d_ln_p_dist_*_*_`d'
        }
        }

    *Keep only items in `i' (should only have variables with 6 digits here):
    keep sector-dist_wt  _*_*_share_`i'0*    d_ln_p_dist_*_*_`i'0*
    gen good_i=`i'
    order good_i, after(dist_wt)

    *gen sum of budget shares (these are only the items that are components of good i):
    egen sum_demo_shares_`i'=rowtotal(_43_demo_share_*), mis
    egen sum_pluto_shares_`i'=rowtotal(_43_pluto_share_*), mis

	*Calculate weighted averages:
    foreach var3 of varlist d_ln_p_dist_`final'_`initial'_`i'0* {
	   loc a=substr("`var3'",22,.)
	   gen aux_d_ln_p_dist_`final'_`initial'_`i'0`a'=`var3'*(_`initial'_pluto_share_`i'0`a')
        gen aux_d_ln_d_dist_`final'_`initial'_`i'0`a'=`var3'*(_`initial'_demo_share_`i'0`a')
	   }
    egen sum_d_ln_p_dist_`final'_`initial'_`i'=rowtotal(aux_d_ln_p_dist_`final'_`initial'_`i'0*), mis
    egen sum_d_ln_d_dist_`final'_`initial'_`i'=rowtotal(aux_d_ln_d_dist_`final'_`initial'_`i'0*), mis

    gen d_ln_p_pluto_`final'_`initial'_`i'=sum_d_ln_p_dist_`final'_`initial'_`i'/sum_pluto_shares_`i'
    gen d_ln_p_demo_`final'_`initial'_`i'=sum_d_ln_d_dist_`final'_`initial'_`i'/sum_demo_shares_`i'
}
    keep state43 sector district43 dist_wt good_i d_ln_p_demo_`final'_`initial'_`i' d_ln_p_pluto_`final'_`initial'_`i'  sum_demo_shares_`i'  sum_pluto_shares_`i'
    tempfile d_ln_p_`final'_`initial'_item_`i'
    qui save `d_ln_p_`final'_`initial'_item_`i'', replace
    display "item `i' done"
}

*end round1 round2:
}


*Assemble all `i' datasets:

use `d_ln_p_55_43_item_1',clear
forvalues j=2/34{
append using `d_ln_p_55_43_item_`j''
}
label var good_i "i goods V3 1-34"
label var sum_demo_shares_1 "sum of hhd demo budget shares of items in i"
label var sum_pluto_shares_1 "sum of hhd pluto budget shares of items in i"
label var d_ln_p_pluto_55_43_1 "change in log price of good i (weighted by initial shares)"
label var d_ln_p_demo_55_43_1 "change in log price of good i (weighted by initial shares)"

*Some i-level goods are blank because there is no consumption in the district of any of the underlying individual items. Replace these cases with the average log price change in the state or country.
forvalues k=1/34 {
bysort state43 sector: egen state_d_ln_p_pluto_55_43_`k'=mean(d_ln_p_pluto_55_43_`k')
bysort sector: egen nat_d_ln_p_pluto_55_43_`k'=mean(d_ln_p_pluto_55_43_`k')

bysort state43 sector: egen state_d_ln_p_demo_55_43_`k'=mean(d_ln_p_demo_55_43_`k')
bysort sector: egen nat_d_ln_p_demo_55_43_`k'=mean(d_ln_p_demo_55_43_`k')

replace d_ln_p_pluto_55_43_`k'= state_d_ln_p_pluto_55_43_`k' if sum_demo_shares_`k'==0 & d_ln_p_pluto_55_43_`k'==.
replace d_ln_p_pluto_55_43_`k'= nat_d_ln_p_pluto_55_43_`k' if sum_demo_shares_`k'==0 & d_ln_p_pluto_55_43_`k'==.

replace d_ln_p_demo_55_43_`k'= state_d_ln_p_demo_55_43_`k' if sum_demo_shares_`k'==0 & d_ln_p_demo_55_43_`k'==.
replace d_ln_p_demo_55_43_`k'= nat_d_ln_p_demo_55_43_`k' if sum_demo_shares_`k'==0 & d_ln_p_demo_55_43_`k'==.

drop nat_d_ln_p_pluto_55_43_`k' state_d_ln_p_pluto_55_43_`k' nat_d_ln_p_demo_55_43_`k' state_d_ln_p_demo_55_43_`k'
}



save "$output/d_ln_p_goods_i_55_43.dta", replace

*end if 1==2:
}




**************************************************************
*Table for relative price changes across round for 34 i items
**************************************************************

*First prepare list of districts with more than 100 households (our main sample)-this bit of code is taken from Create_Dataset_for_Jay_Sayre.do:
cd $input
use "R43R55/expenditures_R43R55_i_groupsV1_D.dta" , clear
drop g_groupsV1 g_groupsV2 g_groupsV3
rename i_groupsV1 i_products
*Merge in G groupings from baseline specification:
merge m:1 i_products using "$master_file_path/data/intermediate_data/Alternative_Gs/G_Groupings_108.dta", keepusing(G_15)

drop _merge
*Drop urban sample:
gen xsector=1 if sector=="Rural"
replace xsector=2 if sector=="Urban"
drop sector
rename xsector sector
drop if sector!=1
*These dont actually drop anything:
drop if wt==0  | wt==.
drop if hhid==.
drop if state43==. | district43==.
drop if sector==.
*Add in extra variables:
merge m:1 hhid using "$master_file_path/data/intermediate_data\hh_shares\hh_43_55V1_D_G108_Ben.dta"
*Drop abnormal mpce observations:
drop if flag_mpce==1
drop _merge
drop if count_hh<100

keep  state43 district43
duplicates drop state43 district43, force
tempfile districts
save `districts'


use "$output/d_ln_p_goods_i_55_43.dta", clear
keep if sector=="Rural"
*This is for all 431 districts. Should we only report this for districts with more than 100 households in sample? YES, done in the code taken from Jay above.

merge m:1 state43 district43 using `districts'
keep if _merge==3
drop _merge


*First report mean and sd of price changes of 34 goods:
gen exp_mean_d_ln_p_demo=.
gen exp_sd_d_ln_p_demo=.

*11/7/20: To make this equal to what we do elsewhere in the paper (see email with Ben): 
*"On Tue, Oct 27, 2020, 14:38 Benjamin Faber <benjam.faber@gmail.com> wrote: exponentiate and minus 1 if we want to show percentage changes"

local i=1
while `i'<35 {
 sum d_ln_p_demo_55_43_`i' [weight=dist_wt]
 local aux=(exp(r(mean))-1)*100
 local aux2=(exp(r(sd))-1)*100
 replace exp_mean_d_ln_p_demo=`aux' if _n==`i'
 replace exp_sd_d_ln_p_demo=`aux2' if _n==`i'
 local i=`i'+1
}
*These are the means and sd's of dlog price changes over time:
preserve
keep exp_mean_d_ln_p_demo exp_sd_d_ln_p_demo
export excel "$output/34_igood_mean_and_sd_price_changes.xlsx", first(var)  keepcellfmt  replace
restore

*Did not use this in the table in the end:
/*

drop mean_d_ln_p_demo sd_d_ln_p_demo

*Next report average and sd's of relative price changes:
	*First fill in price changes within district:
local i=1
while `i'<35 {
	egen aux_`i'=max(d_ln_p_demo_55_43_`i'),by(sector state43 district43)
	replace d_ln_p_demo_55_43_`i'=aux_`i'
	drop aux_`i'
	local i=`i'+1
}	


local i=1
while `i'<35 {
	local j=`i'+1
	while `j'<35 {
		*This is log(P^1_i/P^1_j)-log(P^0_i/P^0_j) but rearranged:
		gen rel_price_change_`i'_`j'=d_ln_p_demo_55_43_`i'-d_ln_p_demo_55_43_`j'
		local j=`j'+1
		}

		*we will put summary stats here in the next step:
		gen mean_rel_p_change_demo_`i'=.
		gen sd_rel_p_change_demo_`i'=.

		local i=`i'+1
}

	*keep one observation per district:
	keep if good_i==1


	*Next calculate mean and sd of relative prices across district for each of the 34 i goods (population weighted): 

local i=1
while `i'<35 {
	local j=`i'+1
	while `j'<35 {
		sum rel_price_change_`i'_`j'  [weight=dist_wt]
		replace mean_rel_p_change_demo_`i'=r(mean) if _n==`j'
		replace sd_rel_p_change_demo_`i'=r(sd) if _n==`j'
		local j=`j'+1
		}
	local i=`i'+1
}

order mean_rel_p_change_demo_* sd_rel_p_change_demo_*, after(rel_price_change_33_34)
	

*These are the means and sd's of relative price changes over time:
keep mean_rel_p_change_demo_* sd_rel_p_change_demo_*
export excel "$output/34_igood_relative_price_changes.xlsx", first(var)  keepcellfmt  replace

*/
